import jieba

# Load the stop-word list, one word per line. A set gives O(1) membership tests.
STOP_WORDS = set()
stop_word_path = 'data/cn_stopwords.txt'  # alternative list: 'data/baidu_stopwords.txt'
with open(stop_word_path, encoding='utf-8') as f:
    for line in f:
        STOP_WORDS.add(line.strip())
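
# Optional sketch (an assumption, not part of the original script): register
# domain terms so jieba keeps them intact; product names such as '安邦长青树'
# may otherwise be split into fragments by the default dictionary.
jieba.add_word('安邦长青树')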

def tokenize(string):
    """Segment a Chinese string with jieba, then drop stop words."""
    doc = jieba.cut(string, cut_all=False)  # precise-mode segmentation
    return [word for word in doc if word not in STOP_WORDS]

if __name__ == "__main__":
    # Sample query: "Recently I saw something about a waiver (豁免) in
    # Anbang Changqingshu (安邦长青树); what does that mean?"
    string = '最近在安邦长青树中看到什么豁免，这个是什么意思？'
    res = tokenize(string)
    print(res)
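
    # For comparison (illustrative addition): the raw segmentation before
    # stop-word filtering, showing which tokens were removed.
    print(list(jieba.cut(string, cut_all=False)))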
